# Show the current working directory of the notebook kernel.
import os
os.getcwd()
'C:\\Users\\ADMIN'
# Import the packages needed for data handling, plotting, prediction and evaluation.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Unfitted linear-regression estimator; it is trained further down with regressor.fit(...).
regressor = LinearRegression()
# Load the stock-price spreadsheet into a DataFrame.
# NOTE(review): the path is absolute and specific to the ADMIN user's machine; adjust it when running elsewhere.
data= pd.read_excel('C:\\Users\\ADMIN\\1788408-1767133-1729258-1613615-Stock_Price_data_set.xlsx')
# Display the loaded DataFrame.
data
| Date | Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|---|
| 0 | 2018-02-05 | 262.000000 | 267.899994 | 250.029999 | 254.259995 | 254.259995 | 11896100.0 |
| 1 | 2018-02-06 | 247.699997 | 266.700012 | 245.000000 | 265.720001 | 265.720001 | 12595800.0 |
| 2 | 2018-02-07 | 266.579987 | 272.450012 | 264.329987 | 264.559998 | 264.559998 | 8981500.0 |
| 3 | 2018-02-08 | 267.079987 | 267.619995 | 250.000000 | 250.100006 | 250.100006 | 9306700.0 |
| 4 | 2018-02-09 | 253.850006 | 255.800003 | 236.110001 | 249.470001 | 249.470001 | 16906900.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1004 | 2022-01-31 | 401.970001 | 427.700012 | 398.200012 | 427.140015 | 427.140015 | 20047500.0 |
| 1005 | 2022-02-01 | 432.959991 | 458.480011 | 425.540009 | 457.130005 | 457.130005 | 22542300.0 |
| 1006 | 2022-02-02 | 448.250000 | 451.980011 | 426.480011 | 429.480011 | 429.480011 | 14346000.0 |
| 1007 | 2022-02-03 | 421.440002 | 429.260010 | 404.279999 | 405.600006 | 405.600006 | 9905200.0 |
| 1008 | 2022-02-04 | 407.309998 | 412.769989 | 396.640015 | 410.170013 | 410.170013 | 7782400.0 |
1009 rows × 7 columns
# Show the first 5 rows.
data.head()
| Date | Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|---|
| 0 | 2018-02-05 | 262.000000 | 267.899994 | 250.029999 | 254.259995 | 254.259995 | 11896100.0 |
| 1 | 2018-02-06 | 247.699997 | 266.700012 | 245.000000 | 265.720001 | 265.720001 | 12595800.0 |
| 2 | 2018-02-07 | 266.579987 | 272.450012 | 264.329987 | 264.559998 | 264.559998 | 8981500.0 |
| 3 | 2018-02-08 | 267.079987 | 267.619995 | 250.000000 | 250.100006 | 250.100006 | 9306700.0 |
| 4 | 2018-02-09 | 253.850006 | 255.800003 | 236.110001 | 249.470001 | 249.470001 | 16906900.0 |
# Show the last 5 rows.
data.tail()
| Date | Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|---|
| 1004 | 2022-01-31 | 401.970001 | 427.700012 | 398.200012 | 427.140015 | 427.140015 | 20047500.0 |
| 1005 | 2022-02-01 | 432.959991 | 458.480011 | 425.540009 | 457.130005 | 457.130005 | 22542300.0 |
| 1006 | 2022-02-02 | 448.250000 | 451.980011 | 426.480011 | 429.480011 | 429.480011 | 14346000.0 |
| 1007 | 2022-02-03 | 421.440002 | 429.260010 | 404.279999 | 405.600006 | 405.600006 | 9905200.0 |
| 1008 | 2022-02-04 | 407.309998 | 412.769989 | 396.640015 | 410.170013 | 410.170013 | 7782400.0 |
# Summary statistics of the data.
# describe is a method and has to be called: without the parentheses the cell only
# echoes the bound method (i.e. the DataFrame's repr) and computes no statistics.
data.describe()
<bound method NDFrame.describe of Date Open High Low Close Adj Close \
0 2018-02-05 262.000000 267.899994 250.029999 254.259995 254.259995
1 2018-02-06 247.699997 266.700012 245.000000 265.720001 265.720001
2 2018-02-07 266.579987 272.450012 264.329987 264.559998 264.559998
3 2018-02-08 267.079987 267.619995 250.000000 250.100006 250.100006
4 2018-02-09 253.850006 255.800003 236.110001 249.470001 249.470001
... ... ... ... ... ... ...
1004 2022-01-31 401.970001 427.700012 398.200012 427.140015 427.140015
1005 2022-02-01 432.959991 458.480011 425.540009 457.130005 457.130005
1006 2022-02-02 448.250000 451.980011 426.480011 429.480011 429.480011
1007 2022-02-03 421.440002 429.260010 404.279999 405.600006 405.600006
1008 2022-02-04 407.309998 412.769989 396.640015 410.170013 410.170013
Volume
0 11896100.0
1 12595800.0
2 8981500.0
3 9306700.0
4 16906900.0
... ...
1004 20047500.0
1005 22542300.0
1006 14346000.0
1007 9905200.0
1008 7782400.0
[1009 rows x 7 columns]>
# Count the non-null entries in each column.
data.count()
Date 1009 Open 1009 High 1009 Low 1009 Close 1009 Adj Close 1009 Volume 1009 dtype: int64
# Row labels of the DataFrame.
data.index
RangeIndex(start=0, stop=1009, step=1)
# Column names (the header of the table).
data.columns
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')
# Summary statistics for every column, including the non-numeric Date column.
data.describe(include = 'all')
C:\Users\ADMIN\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now. """Entry point for launching an IPython kernel.
| Date | Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|---|
| count | 1009 | 1009.000000 | 1009.000000 | 1009.000000 | 1009.000000 | 1009.000000 | 1.009000e+03 |
| unique | 1009 | NaN | NaN | NaN | NaN | NaN | NaN |
| top | 2018-02-05 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | 1 | NaN | NaN | NaN | NaN | NaN | NaN |
| first | 2018-02-05 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN |
| last | 2022-02-04 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | NaN | 419.059673 | 425.320703 | 412.374044 | 419.000733 | 419.000733 | 7.570685e+06 |
| std | NaN | 108.537532 | 109.262960 | 107.555867 | 108.289999 | 108.289999 | 5.465535e+06 |
| min | NaN | 233.919998 | 250.649994 | 231.229996 | 233.880005 | 233.880005 | 1.144000e+06 |
| 25% | NaN | 331.489990 | 336.299988 | 326.000000 | 331.619995 | 331.619995 | 4.091900e+06 |
| 50% | NaN | 377.769989 | 383.010010 | 370.880005 | 378.670013 | 378.670013 | 5.934500e+06 |
| 75% | NaN | 509.130005 | 515.630005 | 502.529999 | 509.079987 | 509.079987 | 9.322400e+06 |
| max | NaN | 692.349976 | 700.989990 | 686.090027 | 691.690002 | 691.690002 | 5.890430e+07 |
# Show the data type of each column.
data.dtypes
Date datetime64[ns] Open float64 High float64 Low float64 Close float64 Adj Close float64 Volume float64 dtype: object
# Number of missing values in each column.
data.isna().sum()
Date 0 Open 0 High 0 Low 0 Close 0 Adj Close 0 Volume 0 dtype: int64
# Per-column boolean flag: True if the column contains at least one missing value.
data.isna().any()
Date False Open False High False Low False Close False Adj Close False Volume False dtype: bool
# Total number of missing values in the whole DataFrame (this only counts them; nothing is removed).
data.isnull().sum().sum()
0
# Ensure Date is a datetime column, discard 'Adj Close' (it mirrors Close in the
# rows and statistics shown above) and report how many rows remain.
data['Date'] = pd.to_datetime(data['Date'])
data.drop(columns='Adj Close', inplace=True)
print(data.shape[0])
1009
# Line plot of the opening price against the row index.
data['Open'].plot(figsize=(16,6))
<AxesSubplot:>
# Correlation matrix
def plot_corr_matrix(data, g_width):
    """Plot the correlation matrix of the usable numeric columns of *data*.

    Columns that are non-numeric, contain NaN, or hold a single constant value
    are ignored. If fewer than two columns remain, a message is printed and
    nothing is plotted.

    Args:
        data: pandas DataFrame to analyse.
        g_width: width and height of the (square) figure.

    Returns:
        None.
    """
    # A plain DataFrame has no ``dataframeName`` attribute, so fall back to a
    # generic label instead of failing when the caller never set one.
    file_name = getattr(data, 'dataframeName', 'data')
    # Restrict to numeric columns explicitly: newer pandas versions no longer
    # drop non-numeric columns silently inside corr().
    data = data.select_dtypes(include=[np.number])
    # Drop columns containing NaN; the axis is passed by keyword because the
    # positional form is deprecated in pandas.
    data = data.dropna(axis='columns')
    # Keep only columns with more than one unique value.
    data = data[[col for col in data if data[col].nunique() > 1]]
    if data.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({data.shape[1]}) is less than 2')
        return
    corr = data.corr()
    fig = plt.figure(num=None, figsize=(g_width, g_width), dpi=80, facecolor='w', edgecolor='k')
    # Draw on the figure created above; a hard-coded figure number would target
    # a different figure whenever other figures are already open.
    corr_matrix = plt.matshow(corr, fignum=fig.number)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corr_matrix)
    plt.title(f'Correlation Matrix for {file_name}', fontsize=15)
    plt.show()
# Scatter and density plots
def plot_scatter_mat(data, plot_size, text_size):
    """Plot a scatter matrix with KDE diagonals for the numeric columns of *data*.

    Columns that are non-numeric, contain NaN, or hold a single constant value
    are ignored, and at most the first 10 remaining columns are plotted. Each
    panel above the diagonal is annotated with the pair's correlation
    coefficient. If no usable column remains, a message is printed and nothing
    is plotted.

    Args:
        data: pandas DataFrame to analyse.
        plot_size: width and height of the (square) figure.
        text_size: font size of the correlation annotations.

    Returns:
        None.
    """
    data = data.select_dtypes(include=[np.number])  # keep only numerical columns
    # Drop columns containing NaN; the axis is passed by keyword because the
    # positional form is deprecated in pandas.
    data = data.dropna(axis='columns')
    # Keep only columns with more than one unique value.
    data = data[[col for col in data if data[col].nunique() > 1]]
    column_names = list(data)
    if not column_names:
        print('No scatter plots shown: no numeric, NaN-free, non-constant columns are left')
        return
    if len(column_names) > 10:
        # Cap the matrix at the first 10 columns.
        column_names = column_names[:10]
    data = data[column_names]
    ax = pd.plotting.scatter_matrix(data, alpha=0.75, figsize=[plot_size, plot_size], diagonal='kde')
    corr = data.corr().values
    # Annotate every panel above the diagonal; numpy is used directly rather
    # than through the undocumented ``plt.np`` alias.
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corr[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=text_size)
    plt.suptitle('Scatter and Density Plot')
    plt.show()
# Draw the scatter/density matrix with a 25 x 25 figure and annotation text size 20.
plot_scatter_mat(data, 25, 20)
C:\Users\ADMIN\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.dropna will be keyword-only """
# Compute the correlation matrix and open figure number 25.
# NOTE(review): nothing is drawn on the figure in this cell, hence the empty-figure output below.
corr = data.corr()
plt.figure(num=25, figsize=(35, 45), dpi=80, facecolor='w', edgecolor='k')
<Figure size 2800x3600 with 0 Axes>
<Figure size 2800x3600 with 0 Axes>
# NOTE(review): the figure created here is number 25, but matshow is told to draw on
# figure 1 (fignum = 1), so figure 25 itself stays empty - see the "0 Axes" output below.
plt.figure(num=25, figsize=(135, 145), dpi=80, facecolor='w', edgecolor='k')
corr_matrix = plt.matshow(corr, fignum = 1)
<Figure size 10800x11600 with 0 Axes>
# Correlation heat-map of the numeric columns.
# corr() is restricted to numeric columns explicitly: the Date column is not numeric,
# and newer pandas versions no longer drop such columns silently.
corr = data.select_dtypes(include=[np.number]).corr()
fig = plt.figure(num=25, figsize=(25, 20), dpi=80, facecolor='w', edgecolor='k')
# Draw on the figure created above. The previous fignum = 1 sent the matrix to a
# different figure, leaving this one empty ("0 Axes").
corr_matrix = plt.matshow(corr, fignum=fig.number)
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.gca().xaxis.tick_bottom()
plt.colorbar(corr_matrix)
# Use a short label: interpolating the DataFrame itself put its whole repr into the title.
plt.title('Correlation Matrix for Stock Price data set', fontsize=10)
plt.show()
<Figure size 2000x1600 with 0 Axes>
# Features: opening, high and low price plus traded volume. Target: closing price.
x = data.loc[:, ['Open','High','Low','Volume']]
y = data.loc[:, 'Close']
# Split the rows into a training part and a held-out test part; the fixed
# random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
)
# Shapes of the four splits; the tuples between the statements are the recorded cell outputs.
x_train.shape
(756, 4)
x_test.shape
(253, 4)
y_train.shape
(756,)
y_test.shape
(253,)
# Bar chart of the closing price over the Date column for the complete dataset.
import plotly.graph_objects as go
import plotly.express as px
figure = px.bar(data, x = 'Date', y = 'Close')
figure.show()
# Line chart of the closing price; the range slider on the x-axis lets you zoom in on a date window.
figure = px.line(data, x='Date' , y='Close', title='Stock price with rangeslider')
figure.update_xaxes(rangeslider_visible=True)
figure.show()
# Histogram with a KDE curve for every column from Date to Volume, each on its own
# 15 x 10 figure; the x-axis label of each plot names the column.
for column in ('Date', 'Open', 'High', 'Low', 'Close', 'Volume'):
    plt.figure(figsize=(15,10))
    sns.histplot(data = data, x = column, kde = True)
<AxesSubplot:xlabel='Volume', ylabel='Count'>
# Fit the linear-regression model on the training split.
regressor.fit(x_train,y_train)
LinearRegression()
# Fitted coefficients, one per feature column of x (Open, High, Low, Volume).
print(regressor.coef_)
[-5.98637669e-01 7.42752459e-01 8.57948723e-01 9.68159262e-08]
# Fitted intercept of the regression.
print(regressor.intercept_)
-0.7077595574160114
# Predict the closing price for the test rows, then print the test features for reference.
predicted=regressor.predict(x_test)
print(x_test)
Open High Low Volume 801 557.000000 559.750000 550.299988 2720300.0 311 378.000000 383.500000 374.510010 5398200.0 85 368.540009 368.700012 357.799988 8278000.0 435 278.049988 285.750000 277.350006 6248400.0 204 260.549988 266.250000 253.800003 12498600.0 ... ... ... ... ... 583 418.829987 426.720001 415.980011 3743700.0 200 283.790009 285.089996 269.149994 12993800.0 767 525.000000 548.539978 518.280029 4136500.0 1000 379.140015 387.709991 365.130005 15145800.0 385 298.859985 303.549988 296.269989 6905800.0 [253 rows x 4 columns]
# One prediction per test row.
predicted.shape
(253,)
# Put the held-out closing prices next to the model's predictions; the rows keep
# the labels of y_test.
dfr = pd.DataFrame({'Actual': y_test, 'Predicted': predicted})
# dframe used to be built with pd.DataFrame(y_test, predicted), which hands the
# predictions to the constructor as the row index instead of as a column. The name is
# kept in case other cells use it, but it now holds a copy of the correct table.
dframe = dfr.copy()
print(dfr)
Actual Predicted 801 553.729980 553.999288 311 379.059998 379.685786 85 361.399994 360.298634 435 281.859985 283.639587 204 261.429993 260.032497 ... ... ... 583 425.920013 422.764132 200 270.600006 273.331047 767 546.150024 537.495050 1000 366.420013 375.026471 385 302.799988 300.698946 [253 rows x 2 columns]
# First 10 rows of the actual-vs-predicted comparison.
dfr.head(10)
| Actual | Predicted | |
|---|---|---|
| 801 | 553.729980 | 553.999288 |
| 311 | 379.059998 | 379.685786 |
| 85 | 361.399994 | 360.298634 |
| 435 | 281.859985 | 283.639587 |
| 204 | 261.429993 | 260.032497 |
| 590 | 434.480011 | 434.730957 |
| 1 | 265.720001 | 260.518692 |
| 780 | 518.020020 | 518.570772 |
| 457 | 315.929993 | 315.400869 |
| 299 | 348.869995 | 345.455041 |
# Score of the fitted model on the test split (for LinearRegression this is the R^2 coefficient of determination).
regressor.score(x_test,y_test)
0.9982601041694543
# Mean absolute error between the actual and the predicted closing prices.
import math
print('Mean Absolute Error:',metrics.mean_absolute_error(y_test,predicted))
Mean Absolute Error: 3.124088127372266
# Mean squared error between the actual and the predicted closing prices.
print('Mean Squared Error:',metrics.mean_squared_error(y_test,predicted))
Mean Squared Error: 19.16361234146909
# Root mean squared error: the square root of the mean squared error.
print('Root Mean Squared Error:',math.sqrt(metrics.mean_squared_error(y_test,predicted)))
Root Mean Squared Error: 4.377626336437258
# Plotting every test row at once would be unreadable, so only the first 30 rows of
# the actual-vs-predicted table are drawn as a grouped bar chart.
graph = dfr.iloc[:30]
graph.plot.bar()
<AxesSubplot:>